# Load the libraries
library(tidyverse)
library(plotly)
library(broom)
library(knitr)
This mini-project demands a good understanding of the statistical test. For more information about statistical testing, please refer to this link. In this mini-project, the task is to analyze gapminder_clean.csvdataset using Rand tidyverse. For this mini-project, the p-value decision rule is set to be \(\alpha = 0.05\).
Complete the following analysis in R and generate a RMarkdownreport to show the analysis and results:
Read in the gapminder_clean.csv data as a tibble using read_csv.
gapminder <- read_csv("gapminder_clean.csv")
# Rename column names of gapminder dataset.
gapminder <- gapminder %>% rename(co2_emission = `CO2 emissions (metric tons per capita)`, country_name =`Country Name`,
energy_use = `Energy use (kg of oil equivalent per capita)`, imports_services = `Imports of goods and services (% of GDP)`,
population_density = `Population density (people per sq. km of land area)`,
life_expectancy = `Life expectancy at birth, total (years)`)
Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing CO2 emissions (metric tons per capita) and gdpPercap for the filtered data.
# Filter gapminder dataset with year 1962 and scatterplot (in logarithmic scale) comparing CO2 emissions and gdpPercap
gapminder %>%
filter(Year == 1962) %>%
ggplot(aes(x = gdpPercap, y = co2_emission)) +
geom_point(alpha = 0.3) + scale_x_log10(name = "GDP per capita (log10)") +
scale_y_log10(name = "CO2 emissions metric tons per capita (log10)") + labs(title="CO2 emissions vs GDP in 1962 (Logarithmic Scale)") + theme_classic()
On the filtered data, calculate the pearson correlation of CO2 emissions (metric tons per capita) and gdpPercap. What is the Pearson R value and associated p value?
# Filter the data
gapminder_1962 <- gapminder %>%
filter(Year == 1962)
# Correlation test
test_result <- tidy(cor.test(gapminder_1962$co2_emission,gapminder_1962$gdpPercap))
test_result
## # A tibble: 1 × 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 0.926 25.3 1.13e-46 106 0.893 0.949 Pearson'… two.sided
From the table above, the correlation between CO2 emissions and gdpPercap is 0.9260817 and the associated p value is 1.1286792^{-46}.
On the unfiltered data, answer “In what year is the correlation between CO2 emissions (metric tons per capita) and gdpPercap the strongest?” Filter the dataset to that year for the next step…
# Correlation for all years
gap_cor <- gapminder %>%
group_by(Year) %>%
summarize(correlation = cor(co2_emission, gdpPercap, use = "complete.obs")) %>%
arrange(desc(correlation))
gap_cor
## # A tibble: 10 × 2
## Year correlation
## <dbl> <dbl>
## 1 1967 0.939
## 2 1962 0.926
## 3 1972 0.843
## 4 1982 0.817
## 5 1987 0.810
## 6 1992 0.809
## 7 1997 0.808
## 8 2002 0.801
## 9 1977 0.793
## 10 2007 0.720
From the table above, the correlation between CO2 emissions and gdpPercap is the strongest at year 2007 with a correlation value of 0.9387918.
Using plotly, create an interative scatter plot comparing CO2 emissions (metric tons per capita) and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot to a plotly plot using the ggplotly() command.
# Scatterplot for year with strongest correlation between CO2 emissions and gdpPercap
gapminder_scatter_P1Q2 <- gapminder %>%
filter(Year == 1967, !is.na(continent)) %>%
ggplot(aes(x = gdpPercap, y = co2_emission, size = pop, color = continent)) + geom_point() + scale_x_log10(name = "GDP per capita (log10)") +
scale_y_log10("CO2 emissions metric tons per capita (log10)") + guides(color = guide_legend(order=2), size = guide_legend(order = 1)) +
labs(title="CO2 emissions vs GDP in 1962 (Logarithmic Scale)") + theme_classic()
ggplotly(gapminder_scatter_P1Q2, width = 800, height = 500)
Now, without further guidance, use your R Data Science skills (and appropriate statistical tests) to answer the following:
What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)' ? (stats test needed)
# Filter the continent column to avoid missing values
gapminder_continent_filter <- gapminder %>%
filter(!is.na(continent))
# Boxplot comparing energy use between continents
ggplot(gapminder_continent_filter, aes(x = continent, y = energy_use)) + geom_boxplot() + labs(title="Relationships between Energy use and Continent") +
scale_x_discrete(name = 'Continent') + scale_y_continuous(name = 'Energy use (kg of oil equivalent per capita)') + theme_classic()
The boxplot above describes the difference in energy use for different continents. We first use one-way ANOVA test to verify whether this difference is real or it is due to random noise like sampling error.
# p-value outcome from one-way ANOVA test
aov_test_result <- tidy(aov(energy_use~continent, data = gapminder_continent_filter))
aov_test_result
## # A tibble: 2 × 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 continent 4 771482483. 192870621. 51.5 8.53e-39
## 2 Residuals 843 3159591816. 3748033. NA NA
From the table above, the p-value for this test is 8.5270035^{-39} and thus it is statistically significant.
one-way ANOVA test only tells us whether there is a significant difference between continents concerning energy usage, but we do not know where these differences occur. To further verify this, we need to perform a post-hoc test, for instance, Tukey’s test to check statistical significance for all possible combinations of two out of all continents.
# Tukey test
tukey_test <- TukeyHSD(aov(energy_use~as.factor(continent), data = gapminder_continent_filter))
# Filter the combinations with p-value >0.05
filtered_tukey_test <- tidy(tukey_test) %>% filter(adj.p.value >= 0.05)
tukey_test
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = energy_use ~ as.factor(continent), data = gapminder_continent_filter)
##
## $`as.factor(continent)`
## diff lwr upr p adj
## Americas-Africa 1005.1037 466.8326 1543.3748 0.0000041
## Asia-Africa 1168.7636 628.2529 1709.2742 0.0000000
## Europe-Africa 2447.5453 1947.3838 2947.7067 0.0000000
## Oceania-Africa 3281.7976 2040.3410 4523.2543 0.0000000
## Asia-Americas 163.6599 -384.4160 711.7357 0.9256447
## Europe-Americas 1442.4416 934.1141 1950.7691 0.0000000
## Oceania-Americas 2276.6940 1031.9249 3521.4630 0.0000069
## Europe-Asia 1278.7817 768.0833 1789.4801 0.0000000
## Oceania-Asia 2113.0341 867.2950 3358.7732 0.0000402
## Oceania-Europe 834.2524 -394.5176 2063.0223 0.3421942
From the table above, we can see that only Asia-Americas, Oceania-Europe combinations are not statistically significant whereas the others do.
Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (stats test needed)
# Filter Asia and Europe out from continent column
gapminder_Asia_Europe <- gapminder %>% filter(continent %in% c('Europe','Asia'))
# Perform t test
t_test <- t.test(gapminder_Asia_Europe$imports_services~gapminder_Asia_Europe$continent)
df_t_test <- tidy(t_test)
t_test
##
## Welch Two Sample t-test
##
## data: gapminder_Asia_Europe$imports_services by gapminder_Asia_Europe$continent
## t = 1.9982, df = 281.72, p-value = 0.04665
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## 0.079458 10.573429
## sample estimates:
## mean in group Asia mean in group Europe
## 40.36486 35.03841
For this question, I performed two-sided unpaired t-test. From the table, the resulting p-value is 0.0466528. So I conclude that the difference is statistically significant.
What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
# gapminder pop density ranking by country across all years
gapminder_ranking_by_year <- gapminder %>%
group_by(Year) %>%
arrange(desc(population_density)) %>%
mutate(ranking = row_number())
# gapminder average pop density ranking by country across all years
gapminder_avg_rank <- gapminder_ranking_by_year %>%
group_by(country_name) %>%
summarize(avg_rank = mean(ranking)) %>%
arrange(avg_rank)
# countr/countries with best average pop density ranking
best_rank <- gapminder_avg_rank %>%
filter(avg_rank == min(avg_rank))
gapminder_avg_rank
## # A tibble: 263 × 2
## country_name avg_rank
## <chr> <dbl>
## 1 Macao SAR, China 1.5
## 2 Monaco 1.5
## 3 Hong Kong SAR, China 3.1
## 4 Singapore 3.9
## 5 Gibraltar 5
## 6 Bermuda 6.2
## 7 Malta 7
## 8 Bangladesh 9.2
## 9 Channel Islands 9.4
## 10 Maldives 10.7
## # … with 253 more rows
From the table above, Macao SAR, China, Monaco has the highest average ranking of 1.5 (lower is better) in population density across all years.
What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?
# life expectancy increase by country across all years
gapminder_overall_lifeExp_increase <- gapminder %>%
group_by(country_name) %>%
summarize(overall_increase = tail(life_expectancy, n = 1) - head(life_expectancy, n = 1)) %>%
filter(!is.na(overall_increase)) %>%
arrange(desc(overall_increase))
# country/countries with highest life expectancy increase
gapminder_greatest_lifeExp_increase <- gapminder_overall_lifeExp_increase %>%
filter(overall_increase == max(overall_increase))
gapminder_overall_lifeExp_increase
## # A tibble: 237 × 2
## country_name overall_increase
## <chr> <dbl>
## 1 Maldives 36.9
## 2 Bhutan 33.2
## 3 Timor-Leste 31.1
## 4 Tunisia 30.9
## 5 Oman 30.8
## 6 Nepal 30.6
## 7 China 29.9
## 8 Yemen, Rep. 27.2
## 9 Saudi Arabia 26.7
## 10 Iran, Islamic Rep. 26.6
## # … with 227 more rows
From the table above, Maldives showed the greatest increase of 36.9161463 in life expectancy since 1962.